Load Packages
library(tidyverse)
library(forecast)
library(ggcorrplot)
library(reshape2)
library(DT)
library(tidymodels)
library(caret)
library(DAAG)
library(plotly)
library(lemon)
library(cluster.datasets)
library(cluster)
library(factoextra)
library(rpart)
library(rpart.plot)
Load Data
# Read the three data sets used in Parts 1-3.
dcbikeshare <- read_csv("dcbikeshare.csv")
mall_customers <- read_csv("Mall_Customers.csv")
titanic <- read_csv("titanic-1-1.csv")
Part 1
Question 1
# Map the numeric season codes onto a labelled factor, ordered
# spring (2), summer (3), fall (4), winter (1).
dcbikeshare <- dcbikeshare %>%
  mutate(
    season = factor(
      season,
      levels = c(2, 3, 4, 1),
      labels = c("spring", "summer", "fall", "winter")
    )
  )
dcbikeshare
## # A tibble: 731 x 16
## instant dteday season yr mnth holiday weekday workingday weathersit
## <dbl> <date> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2011-01-01 winter 0 1 0 6 0 2
## 2 2 2011-01-02 winter 0 1 0 0 0 2
## 3 3 2011-01-03 winter 0 1 0 1 1 1
## 4 4 2011-01-04 winter 0 1 0 2 1 1
## 5 5 2011-01-05 winter 0 1 0 3 1 1
## 6 6 2011-01-06 winter 0 1 0 4 1 1
## 7 7 2011-01-07 winter 0 1 0 5 1 2
## 8 8 2011-01-08 winter 0 1 0 6 0 2
## 9 9 2011-01-09 winter 0 1 0 0 0 1
## 10 10 2011-01-10 winter 0 1 0 1 1 1
## # ... with 721 more rows, and 7 more variables: temp <dbl>, atemp <dbl>,
## # hum <dbl>, windspeed <dbl>, casual <dbl>, registered <dbl>, cnt <dbl>
Question 2
# Recode holiday and workingday from 0/1 to labelled factors
# with "no" as the reference level.
dcbikeshare <- dcbikeshare %>%
  mutate(
    holiday = ifelse(holiday == 0, "no", "yes"),
    holiday = fct_relevel(holiday, "no", "yes"),
    workingday = ifelse(workingday == 0, "no", "yes"),
    workingday = fct_relevel(workingday, "no", "yes")
  )
Question 3
# Recode yr: 0 = 2011, 1 = 2012, with 2011 as the reference level.
dcbikeshare <- dcbikeshare %>%
  mutate(
    yr = ifelse(yr == 0, "2011", "2012"),
    yr = fct_relevel(yr, "2011", "2012")
  )
Question 4
# Convert the weathersit codes (1-4) into a descriptive factor.
dcbikeshare <- dcbikeshare %>%
  mutate(weathersit = factor(
    weathersit,
    levels = 1:4,
    labels = c(
      "clear",
      "mist",
      "light precipitation",
      "heavy precipitation"
    )
  ))
Question 6
# Simple linear regression predicting daily rental count (cnt)
# from temperature, fit with the lm engine via tidymodels.
cnt_dailytemp <-
  linear_reg() %>%
  set_engine("lm") %>%
  fit(cnt ~ temp, data = dcbikeshare)
cnt_dailytemp %>%
tidy()
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1215. 161. 7.54 1.43e-13
## 2 temp 6641. 305. 21.8 2.81e-81
cnt_dailytemp %>%
glance()
## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.394 0.393 1509. 473. 2.81e-81 1 -6387. 12780. 12793.
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
The model shows that on a day with a temp of 0, we can expect about 1215 bike rentals, on average (the intercept estimate). The slope estimate means that for each one-unit increase in temp, predicted rentals increase by about 6641, on average; the value 161 is the standard error of the intercept, not the per-degree change in rentals.
Question 7
# Full model: all main effects plus a temp x holiday interaction.
cnt_all <- linear_reg() %>%
  set_engine("lm") %>%
  fit(
    cnt ~ season + yr + holiday + workingday + weathersit +
      temp + atemp + hum + windspeed + temp * holiday,
    data = dcbikeshare
  )
cnt_all %>%
tidy()
## # A tibble: 14 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 2717. 268. 10.1 1.36e- 22
## 2 seasonsummer -277. 100. -2.76 5.98e- 3
## 3 seasonfall 410. 96.1 4.27 2.24e- 5
## 4 seasonwinter -1131. 113. -9.96 5.50e- 22
## 5 yr2012 2014. 61.7 32.7 5.12e-144
## 6 holidayyes -1397. 471. -2.97 3.11e- 3
## 7 workingdayyes 120. 67.8 1.77 7.79e- 2
## 8 weathersitmist -420. 81.3 -5.18 2.96e- 7
## 9 weathersitlight precipitation -1908. 207. -9.19 4.03e- 19
## 10 temp 4210. 1394. 3.02 2.62e- 3
## 11 atemp 947. 1522. 0.622 5.34e- 1
## 12 hum -1359. 296. -4.60 5.07e- 6
## 13 windspeed -2722. 435. -6.26 6.54e- 10
## 14 holidayyes:temp 1669. 926. 1.80 7.20e- 2
glance(cnt_all)$adj.r.squared
## [1] 0.820292
The adjusted R-squared value for the model is 0.82.
Part 2
Question 1
# Scatter plot of annual income against age, coloured by sex,
# with a loess smoother per group.
mall_customers %>%
  ggplot(aes(x = Age, y = `Annual Income (k$)`, color = Sex)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "loess") +
  # title corrected: the plot shows income vs age (coloured by sex),
  # not "Annual Income vs Sex"
  labs(title = "Annual Income vs Age") +
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'
As shown in the scatter plot above, from the age range of 20-40, male annual income is greater than female annual income before they begin trending closer together past the age of 40.
# Scatter plot of spending score against age, coloured by sex,
# with a loess smoother per group.
mall_customers %>%
  ggplot(aes(x = Age, y = `Spending Score (1-100)`, color = Sex)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "loess") +
  labs(title = "Spending Score vs Age") +
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'
From this scatter plot, it can be observed that Spending Score is high in both genders at the younger age range of 20-40, before it trends lower past age 40.
# Histogram of spending score, faceted by sex (with repeated axis
# labels on each panel), rendered interactively with plotly.
mall_customers_histogram <- mall_customers %>%
  ggplot() +
  aes(x = `Spending Score (1-100)`, fill = Sex) +
  geom_histogram(bins = 50) +
  facet_rep_wrap(~ Sex, repeat.tick.labels = TRUE) +
  labs(title = "Count of Spending Score of Both Genders", y = "Frequency") +
  theme_minimal()
ggplotly(mall_customers_histogram)
For females, the most observed Spending Score was 42. For males, the most observed Spending Score was 46.
Question 2
# K-means with 5 clusters on annual income and spending score.
# Selecting the columns by name (rather than positions 4:5) makes the
# clustering robust to column reordering; nstart = 10 restarts help
# avoid a poor local optimum.
set.seed(1)
mall_customers_cluster <- mall_customers %>%
  select(`Annual Income (k$)`, `Spending Score (1-100)`) %>%
  kmeans(centers = 5, nstart = 10)
mall_customers_cluster
## K-means clustering with 5 clusters of sizes 22, 35, 81, 39, 23
##
## Cluster means:
## Annual Income (k$) Spending Score (1-100)
## 1 25.72727 79.36364
## 2 88.20000 17.11429
## 3 55.29630 49.51852
## 4 86.53846 82.12821
## 5 26.30435 20.91304
##
## Clustering vector:
## [1] 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5 1 5
## [38] 1 5 1 5 1 5 3 5 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [75] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 3 3 3 3 4 2 4 3 4 2 4 2 4 3 4 2 4 2 4 2 4 2 4 3 4 2 4 2 4
## [149] 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2
## [186] 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4
##
## Within cluster sum of squares by cluster:
## [1] 3519.455 12511.143 9875.111 13444.051 5098.696
## (between_SS / total_SS = 83.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Initial clusters that look at Annual Income and Spending Score.
# Elbow plot (within-cluster sum of squares) to help choose k.
set.seed(1)
options(scipen = 999)
fviz_nbclust(mall_customers[, 4:5], kmeans, method = "wss") +
  theme_minimal()
Elbow plots like the one above show the ideal number of clusters to be used in a data set by looking at where the elbow curves. In this case, I used 5 for the clusters prior. My recommendation for the correct amount of clusters to be used for this data set is 6.
Question 3
# Attach each customer's cluster assignment, then plot the clusters
# with their centroids overlaid as red points.
mall_customers_cluster_df5 <- mall_customers %>%
  mutate(cluster = as.character(mall_customers_cluster$cluster))
ggplot() +
  geom_point(
    data = mall_customers_cluster_df5,
    # use bare column names inside aes() rather than df$column;
    # $-indexing bypasses ggplot's data masking and is fragile
    mapping = aes(
      x = `Annual Income (k$)`,
      y = `Spending Score (1-100)`,
      color = cluster
    )
  ) +
  geom_point(
    mapping = aes(
      x = mall_customers_cluster$centers[, 1],
      y = mall_customers_cluster$centers[, 2]
    ),
    color = "red", size = 3
  ) +
  scale_color_discrete(
    name = " ",
    breaks = c("1", "2", "3", "4", "5"),
    labels = c("Spendthrift", "Frugal", "General", "Target", "Careful")
  ) +
  labs(
    title = "K-means Clustering of Mall Customers",
    x = "Annual Income (k$)",
    y = "Spending Score"
  )
The clustering shows that there are 5 distinct groups of mall customers. This new insight would tell me to start marketing towards the frugal and careful groups. Also, I would continue to market towards the Target group as they are the larger spenders.
Part 3
Question 1
# Estimate the mean and sd of passenger age (excluding missing values)
# and compare the age distribution to a normal with those parameters
# via a QQ plot. The plot itself uses the unfiltered data, so ggplot
# warns about the 177 missing ages it drops.
params <- titanic %>%
  filter(!is.na(Age)) %>%
  summarize(mean = mean(Age), sd = sd(Age))
titanic %>%
  ggplot(aes(sample = Age)) +
  geom_qq(dparams = params) +
  geom_abline() +
  theme_minimal()
## Warning: Removed 177 rows containing non-finite values (stat_qq).
Using the parameters above, I filtered out the ages of NA and used the mean ages and standard deviation of ages and geom_qq and geom_abline to create a QQ-plot of age distribution of the passengers.
Question 2
# Recode Survived into a labelled factor (reference level =
# "Did Not Survive") and drop identifier columns that should not be
# used as predictors. The trailing comma after the last mutate()
# argument was removed: dplyr rejects it as a missing argument.
titanic <- titanic %>%
  mutate(
    Survived = ifelse(Survived == 0, "Did Not Survive", "Survived"),
    Survived = fct_relevel(Survived, "Did Not Survive", "Survived")
  ) %>%
  select(-Name, -PassengerId, -Ticket)
# Reproducible 60/40 train/validation split of the titanic data.
set.seed(1)
# NOTE(review): the name `split` masks base::split() for the rest of
# the session; a more distinctive name (e.g. titanic_split) would be
# safer.
split <- initial_split(titanic, prop = 0.6)
training_data <- training(split)
validation_data <- testing(split)
# Fit a classification tree (Gini splitting) predicting survival from
# the remaining passenger features, plot it, and print the
# complexity-parameter table for pruning decisions.
set.seed(1)
passenger_tree <- rpart(
  Survived ~ Age + Pclass + Sex + SibSp + Parch + Fare + Cabin + Embarked,
  data = training_data,
  parms = list(split = "gini"),
  method = "class"
)
prp(passenger_tree, faclen = 0, varlen = 0, cex = 0.75, yesno = 2)
printcp(passenger_tree)
##
## Classification tree:
## rpart(formula = Survived ~ Age + Pclass + Sex + SibSp + Parch +
## Fare + Cabin + Embarked, data = training_data, method = "class",
## parms = list(split = "gini"))
##
## Variables actually used in tree construction:
## [1] Age Fare Pclass Sex SibSp
##
## Root node error: 206/534 = 0.38577
##
## n= 534
##
## CP nsplit rel error xerror xstd
## 1 0.475728 0 1.00000 1.00000 0.054605
## 2 0.029126 1 0.52427 0.52427 0.045059
## 3 0.024272 3 0.46602 0.54369 0.045670
## 4 0.019417 6 0.39320 0.52427 0.045059
## 5 0.010000 8 0.35437 0.49515 0.044096
Question 3
# Class predictions on both the training and validation sets.
prediction_test <- predict(passenger_tree, newdata = training_data, type = "class")
prediction_test2 <- predict(passenger_tree, newdata = validation_data, type = "class")
training_data$predicted <- prediction_test
options(scipen = 999)
# NOTE(review): this confusion matrix evaluates the tree on the
# TRAINING data, so the reported accuracy is optimistic. The
# validation-set predictions (prediction_test2) are computed above but
# never evaluated — confirm whether the assignment intended
# confusionMatrix(prediction_test2, as.factor(validation_data$Survived)).
confusionMatrix(prediction_test, as.factor(training_data$Survived))
## Confusion Matrix and Statistics
##
## Reference
## Prediction Did Not Survive Survived
## Did Not Survive 312 57
## Survived 16 149
##
## Accuracy : 0.8633
## 95% CI : (0.8312, 0.8913)
## No Information Rate : 0.6142
## P-Value [Acc > NIR] : < 0.00000000000000022
##
## Kappa : 0.7004
##
## Mcnemar's Test P-Value : 0.000002846
##
## Sensitivity : 0.9512
## Specificity : 0.7233
## Pos Pred Value : 0.8455
## Neg Pred Value : 0.9030
## Prevalence : 0.6142
## Detection Rate : 0.5843
## Detection Prevalence : 0.6910
## Balanced Accuracy : 0.8373
##
## 'Positive' Class : Did Not Survive
##
The confusion matrix shows that the classifier is accurate most of the time. The model is significant as the accuracy is higher than the no information rate. Because the positive class is "Did Not Survive", the high sensitivity (0.95) shows the model performed well at identifying passengers who actually died; its specificity (0.72) — the rate at which it correctly identifies survivors — is somewhat lower.